3 Cluster analysis

3.1 Cluster prevalence

Number of strategies each cluster was captured in.

cluster_prevalence <- cluster_counts %>%
  filter(read_count>0) %>%
  group_by(secondary_cluster) %>%
  summarise(n_strategy = n_distinct(overall_strategy)) %>%
  arrange(desc(n_strategy))

#Arranged by phylum and cluster prevalence
cluster_prevalence_phylum <- cluster_prevalence %>%
  left_join(cluster_taxonomy,by="secondary_cluster")

3.2 Cluster frequency

Number of clusters each strategy recovered.

cluster_number <- cluster_counts %>%
  filter(read_count>0) %>%
  group_by(overall_strategy) %>%
  summarise(n = n_distinct(secondary_cluster)) %>%
  arrange(desc(n)) %>%
  mutate(overall_strategy=factor(overall_strategy,levels=rev(overall_strategy)))

cluster_number %>%
  rename(Strategy=overall_strategy) %>%
  select(Strategy,n) %>%
  tt()
tinytable_6nr2caxhuvqwyavjssqo
Strategy n
coassembly_timepoint_all 176
coassembly_animal 163
coassembly_timepoint_cage 160
multicoverage_animal 130
multicoverage_timepoint_cage 126
single_coverage 117
multisplit_timepoint_all 115
multisplit_timepoint_cage 110
multisplit_animal 99
multicoverage_timepoint_all 93
cluster_number %>%
  filter(overall_strategy != "cobinning_treatment") %>%
  ggplot(aes(x = n, y = overall_strategy)) +
  geom_col(color="#666666") +
  coord_cartesian(xlim = c(90,200))+
  theme_classic() +
  theme() +
  labs(x="Number of clusters", y="Strategy")

3.3 Cluster heatmap

Overview of clusters per strategy.

cluster_counts %>%
  left_join(cluster_taxonomy,by="secondary_cluster") %>%
  mutate(Phylum=ifelse(is.na(Phylum),"Bacillota_A",Phylum)) %>%
  mutate(secondary_cluster=factor(secondary_cluster,levels=rev(cluster_tree$tip.label))) %>%
  mutate(overall_strategy=factor(overall_strategy,levels=rev(cluster_number$overall_strategy)))  %>%
  group_by(overall_strategy, secondary_cluster) %>%
  sample_n(1) %>%
  ungroup() %>%
  ggplot(aes(x = secondary_cluster, y = overall_strategy, fill=Phylum)) +
  geom_tile(color="#ffffff") +
  scale_fill_manual(values=phylum_colors) +
  theme_classic() +
  theme(axis.text.x = element_blank(),
        axis.title.y = element_blank()) +
  xlab("MAG clusters")

3.4 Cluster abundance

Maximum read count

maximum_read_count <- cluster_counts %>%
  group_by(secondary_cluster) %>%
  slice_max(order_by = read_count) %>%
  ungroup()
maximum_read_count %>%
  mutate(secondary_cluster=factor(secondary_cluster,levels=rev(cluster_tree$tip.label))) %>%
  ggplot(aes(x = secondary_cluster, y = read_count)) +
  geom_col(color="#666666") +
  theme_classic() +
  theme(axis.text.x = element_blank()) +
  labs(x="MAG clusters", y="Number of reads")

3.5 MAG mapping

Average percentage of read mapped in each strategy against the respective catalogue.

all_counts <- read_delim("data/all_counts.tsv", col_names = c("sample", "total_count")) %>%
  mutate(total_count = total_count * 2)

mag_mapping <- bin_counts %>%
  left_join(all_counts, by = join_by(sample)) %>%
  summarise(mag_mapping = sum(read_count) / mean(total_count), .by = c(sample, overall_strategy)) %>%
  summarise(mag_mapping = mean(mag_mapping), .by = overall_strategy) %>%
  mutate(overall_strategy=factor(overall_strategy,levels=rev(cluster_number$overall_strategy)))

mag_mapping %>%
  filter(!is.na(overall_strategy)) %>%
  rename(Strategy=overall_strategy,`Mapping %`=mag_mapping) %>%
  mutate(`Mapping %`=`Mapping %`*100) %>%
  tt()
tinytable_whdy7s5s8q7xo4pet01u
Strategy Mapping %
single_coverage 80.25097
multicoverage_animal 79.78157
multicoverage_timepoint_all 79.68070
multicoverage_timepoint_cage 80.48701
coassembly_animal 80.91453
coassembly_timepoint_all 82.53138
coassembly_timepoint_cage 80.11122
multisplit_animal 68.33532
multisplit_timepoint_all 83.70626
multisplit_timepoint_cage 77.11125
mag_mapping %>%
  filter(overall_strategy != "cobinning_treatment") %>%
  ggplot(aes(x = mag_mapping, y = overall_strategy)) +
  geom_col(color="#666666") +
  coord_cartesian(xlim = c(0.6,0.9))+
  theme_classic() +
  theme() +
  labs(x="Mapping percentage", y="Strategy")

3.6 Domain-adjusted mapping rates (DAMR)

#import SingleM estimates
smf_files <- list.files(path = "data/singlem/", 
                        pattern = "*tsv.gz", 
                        full.names = T)

smf_estimates <- purrr::map(smf_files, read_delim) %>%
  bind_rows()

mean_smf <- smf_estimates %>%
  mutate(read_fraction = as.numeric(str_replace(read_fraction, "%", ""))) %>%
  pull(read_fraction) %>%
  mean()

mag_mapping %>%
  filter(!is.na(overall_strategy)) %>%
  rename(Strategy=overall_strategy) %>%
  mutate(mag_mapping = mag_mapping * 100,
         DAMR = (mag_mapping / mean_smf) * 100,
         DAMR = scales::number(if_else(DAMR > 100, 100, DAMR),
                               accuracy = 0.1)
         ) %>%
  select(-mag_mapping) %>%
  tt()
tinytable_5vzqzp7v6acalzxuagov
Strategy DAMR
single_coverage 100.0
multicoverage_animal 100.0
multicoverage_timepoint_all 100.0
multicoverage_timepoint_cage 100.0
coassembly_animal 100.0
coassembly_timepoint_all 100.0
coassembly_timepoint_cage 100.0
multisplit_animal 87.9
multisplit_timepoint_all 100.0
multisplit_timepoint_cage 99.2

3.7 Read counts vs prevalence

maximum_read_count %>%
  left_join(cluster_prevalence,by=join_by(secondary_cluster==secondary_cluster)) %>% 
  left_join(cluster_taxonomy,by="secondary_cluster") %>%
  select(read_count, n_strategy, Phylum) %>%
  ggplot(aes(x = read_count, y = n_strategy, group=n_strategy, color=Phylum)) +
      geom_boxplot(color="#999999", fill="#f4f4f4", outlier.shape = NA) +
      scale_y_continuous(breaks=seq(1,10,1))+
      scale_color_manual(values=phylum_colors) +
      geom_jitter() +
      theme_classic() +
      theme() +
      labs(x="Sequencing depth", y="Strategy prevalence")